Introduction

The purpose of this notebook is to give data locations, data ingestion code, and code for rudimentary analysis and visualization of COVID-19 data provided by New York Times, [NYT1].

The following steps are taken:

Note that other, older repositories with COVID-19 data exist, like, [JH1, VK1].

Remark: The time series section is done for illustration purposes only. The forecasts there should not be taken seriously.

Preliminary defintions

From the help of tolower:

capwords <- function(s, strict = FALSE) {
    cap <- function(s) paste(toupper(substring(s, 1, 1)),
                  {s <- substring(s, 2); if(strict) tolower(s) else s},
                             sep = "", collapse = " " )
    sapply(strsplit(s,  split = " "), cap, USE.NAMES = !is.null(names(s)))
}

Import data

NYTimes USA states data

if( !exists("dfNYDataStates") ) {
  dfNYDataStates <- read.csv( "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv", stringsAsFactors = FALSE )
  colnames(dfNYDataStates) <- capwords(colnames(dfNYDataStates))
}
head(dfNYDataStates)
dfNYDataStates$DateObject <- as.POSIXct(dfNYDataStates$Date)
summary(as.data.frame(unclass(dfNYDataStates)))
         Date                State           Fips           Cases            Deaths          DateObject                 
 2020-04-09:  56   Washington   :  83   Min.   : 1.00   Min.   :     0   Min.   :   0.00   Min.   :2020-01-21 00:00:00  
 2020-04-10:  56   Illinois     :  80   1st Qu.:17.00   1st Qu.:    11   1st Qu.:   0.00   1st Qu.:2020-03-12 00:00:00  
 2020-04-11:  56   California   :  79   Median :31.00   Median :   144   Median :   2.00   Median :2020-03-23 00:00:00  
 2020-04-12:  56   Arizona      :  78   Mean   :31.11   Mean   :  2515   Mean   :  75.63   Mean   :2020-03-20 12:22:07  
 2020-03-28:  55   Massachusetts:  72   3rd Qu.:46.00   3rd Qu.:  1066   3rd Qu.:  21.00   3rd Qu.:2020-04-02 00:00:00  
 2020-03-29:  55   Wisconsin    :  68   Max.   :78.00   Max.   :188694   Max.   :9385.00   Max.   :2020-04-12 00:00:00  
 (Other)   :1939   (Other)      :1813                                                                                   

NYTimes USA counties data

if(!exists("dfNYDataCounties") ) {
  dfNYDataCounties <- read.csv( "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv", stringsAsFactors = FALSE )
  colnames(dfNYDataCounties) <- capwords(colnames(dfNYDataCounties))
}
head(dfNYDataCounties)
dfNYDataCounties$DateObject <- as.POSIXct(dfNYDataCounties$Date)
summary(as.data.frame(unclass(dfNYDataCounties)))
         Date              County                 State            Fips           Cases              Deaths           DateObject                 
 2020-04-12: 2679   Washington:  682   Georgia       : 3182   Min.   : 1001   Min.   :     0.0   Min.   :   0.000   Min.   :2020-01-21 00:00:00  
 2020-04-11: 2660   Unknown   :  649   Texas         : 3134   1st Qu.:17161   1st Qu.:     2.0   1st Qu.:   0.000   1st Qu.:2020-03-26 00:00:00  
 2020-04-10: 2629   Jefferson :  519   Virginia      : 2205   Median :28135   Median :     5.0   Median :   0.000   Median :2020-04-02 00:00:00  
 2020-04-09: 2595   Franklin  :  478   Indiana       : 1868   Mean   :29529   Mean   :   106.2   Mean   :   3.179   Mean   :2020-03-31 07:50:28  
 2020-04-08: 2565   Jackson   :  423   North Carolina: 1855   3rd Qu.:42125   3rd Qu.:    22.0   3rd Qu.:   1.000   3rd Qu.:2020-04-07 00:00:00  
 2020-04-07: 2539   Montgomery:  398   Tennessee     : 1813   Max.   :56043   Max.   :103208.0   Max.   :6717.000   Max.   :2020-04-12 00:00:00  
 (Other)   :38181   (Other)   :50699   (Other)       :39791   NA's   :716                                                                        

US county records

if(!exists("dfUSACountyData")){
  dfUSACountyData <- read.csv( "https://raw.githubusercontent.com/antononcube/SystemModeling/master/Data/dfUSACountyRecords.csv", stringsAsFactors = FALSE )
}
head(dfUSACountyData)
summary(as.data.frame(unclass(dfUSACountyData)))
         Country          State                   County          FIPS         Population            Lat             Lon         
 UnitedStates:3143   Texas   : 254   WashingtonCounty:  30   Min.   : 1001   Min.   :      89   Min.   :19.60   Min.   :-166.90  
                     Georgia : 159   JeffersonCounty :  25   1st Qu.:18178   1st Qu.:   10980   1st Qu.:34.70   1st Qu.: -98.23  
                     Virginia: 134   FranklinCounty  :  24   Median :29177   Median :   25690   Median :38.37   Median : -90.40  
                     Kentucky: 120   JacksonCounty   :  23   Mean   :30390   Mean   :  102248   Mean   :38.46   Mean   : -92.28  
                     Missouri: 115   LincolnCounty   :  23   3rd Qu.:45082   3rd Qu.:   67507   3rd Qu.:41.81   3rd Qu.: -83.43  
                     Kansas  : 105   MadisonCounty   :  19   Max.   :56045   Max.   :10170292   Max.   :69.30   Max.   : -67.63  
                     (Other) :2256   (Other)         :2999                                                                       

Merge data

dsNYDataCountiesExtended <- 
  dfNYDataCounties %>% 
  dplyr::inner_join( dfUSACountyData %>% dplyr::select_at( .vars = c("FIPS", "Lat", "Lon", "Population") ), by = c( "Fips" = "FIPS" ) )
dsNYDataCountiesExtended

Basic data analysis

ParetoPlotForColumns( dsNYDataCountiesExtended, c("Cases", "Deaths"), scales = "free" )

Geo-histogram

Leaflet

cf <- colorBin( palette = "Reds", domain = log10(dsNYDataCountiesExtended$Cases), bins = 10 )
m <- 
  leaflet( dsNYDataCountiesExtended[, c("Lat", "Lon", "Cases")] ) %>%
  addTiles() %>% 
  addCircleMarkers( ~Lon, ~Lat, radius = ~ log10(Cases), fillColor = ~ cf(log10(Cases)), color = ~ cf(log10(Cases)), fillOpacity = 0.8, stroke = FALSE, popup = ~Cases )
m

Heat-map plots

An alternative of the geo-visualization is to use a heat-map plot.

Cases

Make a heat-map plot by sorting the rows of the cross-tabulation matrix (that correspond to states):

matSDC <- xtabs( Cases ~ State + Date, dfNYDataStates, sparse = TRUE)
d3heatmap::d3heatmap( log10(matSDC+1), cellnote = as.matrix(matSDC), scale = "none", dendrogram = "row", colors = "Blues") #, theme = "dark")

Deaths

Cross-tabulate states with dates over deaths and plot:

matSDD <- xtabs( Deaths ~ State + Date, dfNYDataStates, sparse = TRUE)
d3heatmap::d3heatmap( log10(matSDD+1), cellnote = as.matrix(matSDD), scale = "none", dendrogram = "row", colors = "Blues") #, theme = "dark")

Time series

dfQuery <- 
  dfNYDataStates %>% 
  dplyr::group_by( Date, DateObject ) %>% 
  dplyr::summarise_at( .vars = c("Cases", "Deaths"), .funs = sum )
dfQueryLongForm <- tidyr::pivot_longer( dfQuery, cols = c("Cases", "Deaths"), names_to = "Variable", values_to = "Value")
dfQueryLongForm
ggplot(dfQueryLongForm) +
  geom_line( aes( x = DateObject, y = log10(Value) ) ) +
  facet_wrap( ~Variable, ncol = 1 )

Cases

fit <- forecast::auto.arima( dfQuery$Cases )
fit
Series: dfQuery$Cases 
ARIMA(0,2,0) 

sigma^2 estimated as 3119674:  log likelihood=-720.54
AIC=1443.08   AICc=1443.13   BIC=1445.47
plot( forecast::forecast(fit, h = 20) )
grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dotted")

Deaths

fit <- forecast::auto.arima( dfQuery$Deaths )
fit
Series: dfQuery$Deaths 
ARIMA(1,2,0) 

Coefficients:
          ar1
      -0.3135
s.e.   0.1100

sigma^2 estimated as 18846:  log likelihood=-513.17
AIC=1030.33   AICc=1030.49   BIC=1035.12
plot( forecast::forecast(fit, h = 20) )
grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dotted")

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKYXV0aG9yOiBBbnRvbiBBbnRvbm92CmRhdGU6IDIwMjAtMDMtMzAKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3IsIGVjaG89RkFMU0V9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShsZWFmbGV0KQpsaWJyYXJ5KGQzaGVhdG1hcCkKbGlicmFyeShQYXJldG9QcmluY2lwbGVBZGhlcmVuY2UpCmxpYnJhcnkobHVicmlkYXRlKQpsaWJyYXJ5KGZvcmVjYXN0KQpgYGAKCiMgSW50cm9kdWN0aW9uCgpUaGUgcHVycG9zZSBvZiB0aGlzIG5vdGVib29rIGlzIHRvIGdpdmUgZGF0YSBsb2NhdGlvbnMsIGRhdGEgaW5nZXN0aW9uIGNvZGUsIGFuZCBjb2RlIGZvciBydWRpbWVudGFyeSBhbmFseXNpcyBhbmQgdmlzdWFsaXphdGlvbiBvZiBDT1ZJRC0xOSBkYXRhIHByb3ZpZGVkIGJ5IE5ldyBZb3JrIFRpbWVzLCBbTllUMV0uIAoKVGhlIGZvbGxvd2luZyBzdGVwcyBhcmUgdGFrZW46CgotIEluZ2VzdCBkYXRhCgogIC0gVGFrZSBDT1ZJRC0xOSBkYXRhIGZyb20gVGhlIE5ldyBZb3JrIFRpbWVzLCBiYXNlZCBvbiByZXBvcnRzIGZyb20gc3RhdGUgYW5kIGxvY2FsIGhlYWx0aCBhZ2VuY2llcywgW05ZVDFdLgoKICAtIFRha2UgVVNBIGNvdW50aWVzIHJlY29yZHMgZGF0YSAoRklQUyBjb2RlcywgZ2VvLWNvb3JkaW5hdGVzLCBwb3B1bGF0aW9ucyksIFtXUkkxXS4KCi0gTWVyZ2UgdGhlIGRhdGEuCgotIE1ha2UgZGF0YSBzdW1tYXJpZXMgYW5kIHJlbGF0ZWQgcGxvdHMuCgotIE1ha2UgY29ycmVzcG9uZGluZyBnZW8tcGxvdHMuCgpOb3RlIHRoYXQgb3RoZXIsIG9sZGVyIHJlcG9zaXRvcmllcyB3aXRoIENPVklELTE5IGRhdGEgZXhpc3QsIGxpa2UsIFtKSDEsIFZLMV0uCgoqUmVtYXJrOiogVGhlIHRpbWUgc2VyaWVzIHNlY3Rpb24gaXMgZG9uZSBmb3IgaWxsdXN0cmF0aW9uIHB1cnBvc2VzIG9ubHkuIFRoZSBmb3JlY2FzdHMgdGhlcmUgc2hvdWxkIG5vdCBiZSB0YWtlbiBzZXJpb3VzbHkuCgojIFByZWxpbWluYXJ5IGRlZmludGlvbnMKCkZyb20gdGhlIGhlbHAgb2YgYHRvbG93ZXJgOgoKYGBge3J9CmNhcHdvcmRzIDwtIGZ1bmN0aW9uKHMsIHN0cmljdCA9IEZBTFNFKSB7CiAgICBjYXAgPC0gZnVuY3Rpb24ocykgcGFzdGUodG91cHBlcihzdWJzdHJpbmcocywgMSwgMSkpLAogICAgICAgICAgICAgICAgICB7cyA8LSBzdWJzdHJpbmcocywgMik7IGlmKHN0cmljdCkgdG9sb3dlcihzKSBlbHNlIHN9LAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIHNlcCA9ICIiLCBjb2xsYXBzZSA9ICIgIiApCiAgICBzYXBwbHkoc3Ryc3BsaXQocywgIHNwbGl0ID0gIiAiKSwgY2FwLCBVU0UuTkFNRVMgPSAhaXMubnVsbChuYW1lcyhzKSkpCn0KYGBgCgojIEltcG9ydCBkYXRhCgojIyBOWVRpbWVzIFVTQSBzdGF0ZXMgZGF0YQoKYGBge3J9CmlmKCAhZXhpc3RzKCJkZk5ZRGF0YVN0YXRlcyIpICkgewogIGRmTllEYXRhU3RhdGVzIDwtIHJlYWQuY3N2KCAiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL255dGltZXMvY292aWQtMTktZGF0YS9tYXN0ZXIvdXMtc3RhdGVzLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSApCiAgY29sbmFtZXMoZGZOWURhdGFTdGF0ZXMpIDwtIGNhcHdvcmRzKGNvbG5hbWVzKGRmTllEYXRhU3RhdGVzKSkKfQpoZWFkKGRmTllEYXRhU3RhdGVzKQpgYGAKCmBgYHtyfQpkZk5ZRGF0YVN0YXRlcyREYXRlT2JqZWN0IDwtIGFzLlBPU0lYY3QoZGZOWURhdGFTdGF0ZXMkRGF0ZSkKYGBgCgpgYGB7cn0Kc3VtbWFyeShhcy5kYXRhLmZyYW1lKHVuY2xhc3MoZGZOWURhdGFTdGF0ZXMpKSkKYGBgCgoKIyMgTllUaW1lcyBVU0EgY291bnRpZXMgZGF0YQoKYGBge3J9CmlmKCFleGlzdHMoImRmTllEYXRhQ291bnRpZXMiKSApIHsKICBkZk5ZRGF0YUNvdW50aWVzIDwtIHJlYWQuY3N2KCAiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL255dGltZXMvY292aWQtMTktZGF0YS9tYXN0ZXIvdXMtY291bnRpZXMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFICkKICBjb2xuYW1lcyhkZk5ZRGF0YUNvdW50aWVzKSA8LSBjYXB3b3Jkcyhjb2xuYW1lcyhkZk5ZRGF0YUNvdW50aWVzKSkKfQpoZWFkKGRmTllEYXRhQ291bnRpZXMpCmBgYAoKYGBge3J9CmRmTllEYXRhQ291bnRpZXMkRGF0ZU9iamVjdCA8LSBhcy5QT1NJWGN0KGRmTllEYXRhQ291bnRpZXMkRGF0ZSkKYGBgCgpgYGB7cn0Kc3VtbWFyeShhcy5kYXRhLmZyYW1lKHVuY2xhc3MoZGZOWURhdGFDb3VudGllcykpKQpgYGAKCiMjIFVTIGNvdW50eSByZWNvcmRzCgpgYGB7cn0KaWYoIWV4aXN0cygiZGZVU0FDb3VudHlEYXRhIikpewogIGRmVVNBQ291bnR5RGF0YSA8LSByZWFkLmNzdiggImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9hbnRvbm9uY3ViZS9TeXN0ZW1Nb2RlbGluZy9tYXN0ZXIvRGF0YS9kZlVTQUNvdW50eVJlY29yZHMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFICkKfQpoZWFkKGRmVVNBQ291bnR5RGF0YSkKYGBgCgpgYGB7cn0Kc3VtbWFyeShhcy5kYXRhLmZyYW1lKHVuY2xhc3MoZGZVU0FDb3VudHlEYXRhKSkpCmBgYAoKIyBNZXJnZSBkYXRhCgpgYGB7cn0KZHNOWURhdGFDb3VudGllc0V4dGVuZGVkIDwtIAogIGRmTllEYXRhQ291bnRpZXMgJT4lIAogIGRwbHlyOjppbm5lcl9qb2luKCBkZlVTQUNvdW50eURhdGEgJT4lIGRwbHlyOjpzZWxlY3RfYXQoIC52YXJzID0gYygiRklQUyIsICJMYXQiLCAiTG9uIiwgIlBvcHVsYXRpb24iKSApLCBieSA9IGMoICJGaXBzIiA9ICJGSVBTIiApICkKZHNOWURhdGFDb3VudGllc0V4dGVuZGVkCmBgYAoKCiMgQmFzaWMgZGF0YSBhbmFseXNpcwoKYGBge3J9ClBhcmV0b1Bsb3RGb3JDb2x1bW5zKCBkc05ZRGF0YUNvdW50aWVzRXh0ZW5kZWQsIGMoIkNhc2VzIiwgIkRlYXRocyIpLCBzY2FsZXMgPSAiZnJlZSIgKQpgYGAKCiMgR2VvLWhpc3RvZ3JhbQoKIyMgTGVhZmxldAoKYGBge3J9CmNmIDwtIGNvbG9yQmluKCBwYWxldHRlID0gIlJlZHMiLCBkb21haW4gPSBsb2cxMChkc05ZRGF0YUNvdW50aWVzRXh0ZW5kZWQkQ2FzZXMpLCBiaW5zID0gMTAgKQpgYGAKCmBgYHtyfQptIDwtIAogIGxlYWZsZXQoIGRzTllEYXRhQ291bnRpZXNFeHRlbmRlZFssIGMoIkxhdCIsICJMb24iLCAiQ2FzZXMiKV0gKSAlPiUKICBhZGRUaWxlcygpICU+JSAKICBhZGRDaXJjbGVNYXJrZXJzKCB+TG9uLCB+TGF0LCByYWRpdXMgPSB+IGxvZzEwKENhc2VzKSwgZmlsbENvbG9yID0gfiBjZihsb2cxMChDYXNlcykpLCBjb2xvciA9IH4gY2YobG9nMTAoQ2FzZXMpKSwgZmlsbE9wYWNpdHkgPSAwLjgsIHN0cm9rZSA9IEZBTFNFLCBwb3B1cCA9IH5DYXNlcyApCm0KYGBgCgoKIyBIZWF0LW1hcCBwbG90cwoKQW4gYWx0ZXJuYXRpdmUgb2YgdGhlIGdlby12aXN1YWxpemF0aW9uIGlzIHRvIHVzZSBhIGhlYXQtbWFwIHBsb3QuCgoKIyMgQ2FzZXMKCk1ha2UgYSBoZWF0LW1hcCBwbG90IGJ5IHNvcnRpbmcgdGhlIHJvd3Mgb2YgdGhlIGNyb3NzLXRhYnVsYXRpb24gbWF0cml4ICh0aGF0IGNvcnJlc3BvbmQgdG8gc3RhdGVzKToKCmBgYHtyfQptYXRTREMgPC0geHRhYnMoIENhc2VzIH4gU3RhdGUgKyBEYXRlLCBkZk5ZRGF0YVN0YXRlcywgc3BhcnNlID0gVFJVRSkKZDNoZWF0bWFwOjpkM2hlYXRtYXAoIGxvZzEwKG1hdFNEQysxKSwgY2VsbG5vdGUgPSBhcy5tYXRyaXgobWF0U0RDKSwgc2NhbGUgPSAibm9uZSIsIGRlbmRyb2dyYW0gPSAicm93IiwgY29sb3JzID0gIkJsdWVzIikgIywgdGhlbWUgPSAiZGFyayIpCmBgYAoKCiMjIERlYXRocwoKQ3Jvc3MtdGFidWxhdGUgc3RhdGVzIHdpdGggZGF0ZXMgb3ZlciBkZWF0aHMgYW5kIHBsb3Q6CgoKYGBge3J9Cm1hdFNERCA8LSB4dGFicyggRGVhdGhzIH4gU3RhdGUgKyBEYXRlLCBkZk5ZRGF0YVN0YXRlcywgc3BhcnNlID0gVFJVRSkKZDNoZWF0bWFwOjpkM2hlYXRtYXAoIGxvZzEwKG1hdFNERCsxKSwgY2VsbG5vdGUgPSBhcy5tYXRyaXgobWF0U0REKSwgc2NhbGUgPSAibm9uZSIsIGRlbmRyb2dyYW0gPSAicm93IiwgY29sb3JzID0gIkJsdWVzIikgIywgdGhlbWUgPSAiZGFyayIpCmBgYAoKIyBUaW1lIHNlcmllcwoKYGBge3J9CmRmUXVlcnkgPC0gCiAgZGZOWURhdGFTdGF0ZXMgJT4lIAogIGRwbHlyOjpncm91cF9ieSggRGF0ZSwgRGF0ZU9iamVjdCApICU+JSAKICBkcGx5cjo6c3VtbWFyaXNlX2F0KCAudmFycyA9IGMoIkNhc2VzIiwgIkRlYXRocyIpLCAuZnVucyA9IHN1bSApCmRmUXVlcnlMb25nRm9ybSA8LSB0aWR5cjo6cGl2b3RfbG9uZ2VyKCBkZlF1ZXJ5LCBjb2xzID0gYygiQ2FzZXMiLCAiRGVhdGhzIiksIG5hbWVzX3RvID0gIlZhcmlhYmxlIiwgdmFsdWVzX3RvID0gIlZhbHVlIikKZGZRdWVyeUxvbmdGb3JtCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoZGZRdWVyeUxvbmdGb3JtKSArCiAgZ2VvbV9saW5lKCBhZXMoIHggPSBEYXRlT2JqZWN0LCB5ID0gbG9nMTAoVmFsdWUpICkgKSArCiAgZmFjZXRfd3JhcCggflZhcmlhYmxlLCBuY29sID0gMSApCmBgYAoKIyMgQ2FzZXMKCmBgYHtyfQpmaXQgPC0gZm9yZWNhc3Q6OmF1dG8uYXJpbWEoIGRmUXVlcnkkQ2FzZXMgKQpmaXQKYGBgCgpgYGB7cn0KcGxvdCggZm9yZWNhc3Q6OmZvcmVjYXN0KGZpdCwgaCA9IDIwKSApCmdyaWQobnggPSBOVUxMLCBueSA9IE5VTEwsIGNvbCA9ICJsaWdodGdyYXkiLCBsdHkgPSAiZG90dGVkIikKYGBgCgojIyBEZWF0aHMKCmBgYHtyfQpmaXQgPC0gZm9yZWNhc3Q6OmF1dG8uYXJpbWEoIGRmUXVlcnkkRGVhdGhzICkKZml0CmBgYAoKYGBge3J9CnBsb3QoIGZvcmVjYXN0Ojpmb3JlY2FzdChmaXQsIGggPSAyMCkgKQpncmlkKG54ID0gTlVMTCwgbnkgPSBOVUxMLCBjb2wgPSAibGlnaHRncmF5IiwgbHR5ID0gImRvdHRlZCIpCmBgYA==